#!/usr/bin/perl
use strict;
use List::Util qw(min max);
use Bio::AlignIO;
use Bio::Tools::GFF;
use Bio::SeqFeatureI;
$|=1;

sub cleanChrName {
	my $chr = shift(@_);
	chomp($chr);
	if(substr($chr,0,1) eq ">") {
		$chr = substr($chr,1);
	}
	if(index($chr, " ")!= -1 || index($chr, "\t") != -1) {
		$chr = (split(' ', $chr))[0];
	}
	return "C" . $chr;
}


my $RUN_MODE = "mask";

if( @ARGV < 3) {
   print "buildRegulatorySeqDB <upstream|downstream> <FastaFile> <GFFFile> <Length> <OutputFastaFile> [TE Annotation GFF File]\n";
   exit();
}
my $seq;
my $seq_len = $ARGV[3];
my $mode = $ARGV[0];
my $teAnnotationFileName = $ARGV[5];
my $gffFileName = $ARGV[2];


open (my $genomeFastaFile, "gunzip -c $ARGV[1] |") || die "can`t open file $ARGV[0]";
open (my $gfffile, $gffFileName) || die "can't open file $gffFileName";
open (my $output, ">", "$ARGV[4]");

# read genomes

my %chromosomes;

my $curChr = "";

while(my $curLine = <$genomeFastaFile>) {
	chomp($curLine);	
	if(substr($curLine,0,1) eq ">") {
		$curChr = cleanChrName($curLine);
	} else {
		$chromosomes{$curChr} .= uc($curLine);
	}
}

close($genomeFastaFile);

#### Filter TEs, if required
if(-e $teAnnotationFileName) {
	my $gff = Bio::Tools::GFF->new(-file => $teAnnotationFileName,
				   -gff_version => 3);
	while(my $curTE = $gff->next_feature) {
		my $chromosome = cleanChrName ($curTE->seq_id);
		if(defined $chromosomes{$chromosome}) {
			substr($chromosomes{$chromosome}, $curTE->start, $curTE->end - $curTE->start) = "X" x ($curTE->end - $curTE->start);
		}
	}
}

my $gff = Bio::Tools::GFF->new(-file => $gffFileName,
			   -gff_version => 3);
while(my $curGene = $gff->next_feature) {
	my $chromosome = cleanChrName ($curGene->seq_id);
	if(defined $chromosomes{$chromosome}) {
#			print $chromosome . "-" . $curGene->start . " -- Length: " . length($chromosomes[$chromosome_num]) . "\n";
		### Make sure no error in annotation placed a gene outside contig limits
		if($curGene->end < length($chromosomes{$chromosome} )) {
			substr($chromosomes{$chromosome}, $curGene->start-1, $curGene->end - $curGene->start) = "X" x ($curGene->end - $curGene->start);
		}
	}
}

my $lastGeneEnd_pos=0;
my $lastChromosome=0;

if($mode eq "upstream") {
	while(my $gffline = <$gfffile>)	{
		if(substr($gffline, 1,1) ne "#") {
		chomp($gffline);
		my @genefields = split(/\t/, $gffline);
		my %geneinfo = split /[;=]/, $genefields[8];
		my $genename = $geneinfo{"Name"};
								
		if($genefields[2] eq "Footprint") {
			my $chromosome = cleanChrName($genefields[0]);
			if(defined $chromosomes{$chromosome}) {
				my $dir = $genefields[6];
  		
				print $output ">". $genename . "\n";
				my $prom;

				if($dir eq "+") {
					my $loc = $genefields[3];
					my $pstart = $loc - $seq_len;
				
					if($pstart <= 0) { ## If this is at the start of the chromosome and we don't have enough upstream seq						
						$prom = substr($chromosomes{$chromosome}, 0, $loc-1)
					} else {
						$prom = substr($chromosomes{$chromosome}, $pstart-1, $seq_len);
					}
				} else {
					my $loc = $genefields[4]; 
					$prom = substr($chromosomes{$chromosome}, $loc, $seq_len);
					# reverse complement sequence
					$prom = reverse $prom;
					$prom =~ tr/ACGT/TGCA/;
				}
				print  $output ('N' x ($seq_len - length($prom)) . $prom . "\n");
			} else {
				print $output ">". $genename . "\n" . ('N' x $seq_len) . "\n"; #If the chromosome is not annotated, print Ns; 
			}
		}
	  }
	}
} elsif($mode eq "downstream") {
	my $curseq_len;
	while(my $gffline = <$gfffile>)	{
		if(substr($gffline, 1,1) ne "#") {
			chomp($gffline);
			my @genefields = split(/\t/, $gffline);
			my %geneinfo = split /[;=]/, $genefields[8];
			my $genename = $geneinfo{"Name"};
				
			if($genefields[2] eq "Footprint") {
				my $chromosome = cleanChrName($genefields[0]);
				if(defined $chromosomes{$chromosome}) {
					my $dir = $genefields[6];

  					print $output ">". $genename . "\n";
  					my $prom;

  					if($dir eq "+") {
  						my $pstart = $genefields[4]+1;
  						$prom = substr($chromosomes{$chromosome}, $pstart-1, $seq_len);
				} else {
					my $pstart = $genefields[3]-$seq_len;
					my $curseq_len = $seq_len;
					my $geneend = $genefields[3];
					if ($pstart <0 ){
						$pstart=0;
						$curseq_len = $geneend;
					}
					$prom = substr($chromosomes{$chromosome}, $pstart-1, $curseq_len);
					# reverse complement sequence
					$prom = reverse $prom;
					$prom =~ tr/ACGT/TGCA/;
				}
				print $output ($prom . ('N' x ($seq_len - length($prom))) . "\n");
			} else {
				print $output ">". $genename . "\n" . ('N' x $seq_len) . "\n"; #If the chromosome is not annotated, print Ns; 
			}
		}
	 }
	}
}	
